'''
    This program will build the pay dataset incorporating in workers' educational
    histories from their resumes. This will produce the main dataset for all of the
    analysis.
'''

print("Starting R_create_salaries_international.")

import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import csv
import statsmodels.api as sm
import statsmodels.formula.api as smf
from datetime import datetime
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import gc


gdSeed = "G:/Shared drives/Economic Research/Research/zzz - Jason Sockin Projects/DataGrab"
fileSeed = "C:/Users/jsock/Dropbox/Research/GD/International"

dataPath = gdSeed + "/Salaries/"
savePath = fileSeed + "/Data/"
inputPath = fileSeed + "/InputData/"

##############################################################################
# Clean between gravity data for merging later
##############################################################################

gravityDF = pd.read_csv(savePath + 'Gravity_V202211.csv')

gravityDF = gravityDF[gravityDF.iso3_o.notnull()]

gravityDF = gravityDF[gravityDF.iso3_d.notnull()]

gravityDF = gravityDF[gravityDF.dist.notnull()]

gravityDF = gravityDF[gravityDF.contig.notnull()]

gravityDF = gravityDF[gravityDF.comlang_off.notnull()]

gravityDF = gravityDF[gravityDF.sibling_ever.notnull()]

gravityDF = gravityDF[gravityDF.col_dep_ever.notnull()]

gravityDF = gravityDF[['iso3_o','iso3_d','dist','contig','comlang_off','sibling_ever','col_dep_ever']]

gravityDF = gravityDF.groupby(['iso3_o','iso3_d']).head(1)

gravityDF.to_csv(savePath + 'Gravity_origin_destination.csv')

##############################################################################
# Read in full salary data and clean
##############################################################################

df = pd.read_csv(dataPath + 'International_salaries_07_13_2022.csv')

df = df.groupby(['FK_userId','FK_employerId','countryName','yearOfSalary']).head(1)

df = df.reset_index()
del df['index']

# Clean dataset
df = df[(df.yearOfSalary>=2006) & (df.yearOfSalary<=2022)]
df = df[df.employmentStatusCode=='REGULAR']
df = df[(df.yearsOfRelevantExpNumber>=0) & (df.yearsOfRelevantExpNumber<=50)]
df = df[df.baseSalary>0]

# Create additional vars
df['salID'] = df.index
df['hasBonus'] = 1 * ((df.cashBonusAmount > 0) | (df.stockBonusAmount > 0) | (df.profitSharingAmount > 0) | (df.salesCommissionAmount > 0))

# Extract salary date 
df['dateVal'] = df.reviewDateTime.apply(lambda x: str(x).split(' ')[0])

# Send unknown gender to single grouping
df.loc[(df.gender!='MALE') & (df.gender!='FEMALE'),'gender'] = 'UNKNOWN'

df = df[df.FK_userId!=-1]

del df['Unnamed: 0']
del df['industryName']
del df['name']
del df['reviewDateTime']

##############################################################################
# Keep only through 2022:Q2
##############################################################################

df['submitYear'] = df.dateVal.apply(lambda x: int(str(x).split('-')[0]))
df['submitMonth'] = df.dateVal.apply(lambda x: int(str(x).split('-')[1]))

df = df[ ~( (df.submitYear==2022) & (df.submitMonth>=7) )]

del df['submitYear']
del df['submitMonth']

##############################################################################
# Drop salaries that are not in the main currency for the country 
##############################################################################

#-------------------------------------
# Incidence of each country-currency pair
#-------------------------------------

countryCurrencySize = df.groupby(['countryName','baseCurrency']).yearOfSalary.size().reset_index()
countryCurrencySize.rename(columns={'yearOfSalary':'countryCurrencySize'},inplace=True)

countrySize = df.groupby(['countryName']).yearOfSalary.size().reset_index()
countrySize.rename(columns={'yearOfSalary':'countrySize'},inplace=True)

df = pd.merge(df, countryCurrencySize, how='left', left_on = ['countryName','baseCurrency'], right_on = ['countryName','baseCurrency'])
df = pd.merge(df, countrySize, how='left', left_on = ['countryName'], right_on = ['countryName'])

df['countryCurrencyShare'] = df.countryCurrencySize / df.countrySize

del df['countryCurrencySize']
del df['countrySize']

#-------------------------------------
# Use main currency
#-------------------------------------

mainCurrency = pd.read_csv(inputPath + "Country_currencies.csv",engine='python',encoding='latin-1')

mainCurrency.rename(columns={'Country or territory':'countryName'},inplace=True)
mainCurrency.rename(columns={'baseCurrency':'mainCurrency'},inplace=True)
del mainCurrency['Currency']

mainCurrency.loc[mainCurrency.countryName=='United States of America','countryName'] = 'United States'
mainCurrency.loc[mainCurrency.countryName=='Czechia','countryName'] = 'Czech Republic'
mainCurrency.loc[mainCurrency.countryName=='Hong Kong (China)','countryName'] = 'Hong Kong'

df = pd.merge(df, mainCurrency, how='left', left_on = ['countryName'], right_on = ['countryName'])

z = df[df.mainCurrency.isnull()].groupby('countryName').size().reset_index()

df = df[df.mainCurrency==df.baseCurrency]

del df['mainCurrency']

##############################################################################
# Add education information from resumes
##############################################################################

gc.collect()

# Read in domestic and international schools on resumes
resumes = pd.read_csv(savePath + "Cleaned_user_schools_majors.csv")
del resumes['Unnamed: 0']

resumes = resumes[resumes.school.notnull()]

df = pd.merge(df, resumes, how='left', left_on = ['FK_userId'], right_on = ['FK_userId'])

resumesSchools = df.groupby('school').size().reset_index()
 
del resumes

##############################################################################
# Look at sample size for international coverage
##############################################################################

#------------------------------------------
# Worker country count 
#------------------------------------------

workerCountries = df.groupby(['FK_userId','countryName']).head(1).groupby(['FK_userId']).basePayPeriodCode.size().reset_index()
workerCountries.rename(columns={'basePayPeriodCode':'workerCountries'},inplace=True)

df = pd.merge(df, workerCountries, how='left', left_on = ['FK_userId'], right_on = ['FK_userId'])

del workerCountries

gc.collect()

##############################################################################
# Determine if salaries should be dropped for censoring
##############################################################################

#-----------------------------
# Convert bonus values to annual 
#   - if not reported annually, kill to 0.001 for incidence but not regressions
#-----------------------------

df['cashBonusYearly'] = 0
df['currValid'] = ((df.cashBonusPayPeriodId==1) | (df.dateVal < '2012-09-09')) & (df.cashBonusAmount > 0)
df.loc[df.currValid==1,'cashBonusYearly'] = df[df.currValid==1].cashBonusAmount
df['currValid'] = (df.cashBonusPayPeriodId==3) & (df.cashBonusAmount > 0)
df.loc[df.currValid==1,'cashBonusYearly'] = 0.001
del df['currValid']

df['stockBonusYearly'] = 0
df['currValid'] = ((df.stockBonusPayPeriodId==1) | (df.dateVal < '2012-09-09')) & (df.stockBonusAmount > 0)
df.loc[df.currValid==1,'stockBonusYearly'] = df[df.currValid==1].stockBonusAmount
df['currValid'] = (df.stockBonusPayPeriodId==3) & (df.stockBonusAmount > 0)
df.loc[df.currValid==1,'stockBonusYearly'] = 0.001
del df['currValid']

df['profitSharingYearly'] = 0
df['currValid'] = ((df.profitSharingPayPeriodId==1) | (df.dateVal < '2012-09-09')) & (df.profitSharingAmount > 0)
df.loc[df.currValid==1,'profitSharingYearly'] = df[df.currValid==1].profitSharingAmount
df['currValid'] = (df.profitSharingPayPeriodId==3) & (df.profitSharingAmount > 0)
df.loc[df.currValid==1,'profitSharingYearly'] = 0.001
del df['currValid']

df['salesCommissionYearly'] = 0
df['currValid'] = ((df.salesCommissionPayPeriodId==1) | (df.dateVal < '2012-09-09')) & (df.salesCommissionAmount > 0)
df.loc[df.currValid==1,'salesCommissionYearly'] = df[df.currValid==1].salesCommissionAmount
df['currValid'] = (df.salesCommissionPayPeriodId==3) & (df.salesCommissionAmount > 0)
df.loc[df.currValid==1,'salesCommissionYearly'] = 0.001
del df['currValid']

df['tipsYearly'] = 0
df['currValid'] = ((df.tipsPayPeriodId==1) | (df.dateVal < '2012-09-09')) & (df.tipsAmount > 0)
df.loc[df.currValid==1,'tipsYearly'] = df[df.currValid==1].tipsAmount
df['currValid'] = (df.tipsPayPeriodId==3) & (df.tipsAmount > 0)
df.loc[df.currValid==1,'tipsYearly'] = 0.001
del df['currValid']

df['dropNotAnnual'] = ((df.cashBonusYearly == 0.001) | (df.stockBonusYearly == 0.001) | (df.profitSharingYearly == 0.001) | (df.salesCommissionYearly == 0.001) | (df.tipsYearly == 0.001)) * 1    

#-----------------------------
# Clean the dataset 
#-----------------------------

del df['cashBonusPayPeriodId']
del df['stockBonusPayPeriodId']
del df['profitSharingPayPeriodId']
del df['salesCommissionPayPeriodId']
del df['employmentStatusCode']
del df['tipsPayPeriodId']

df.rename(columns={'uniquename':'metro'},inplace=True)
df.rename(columns={'stateAbbreviation':'state'},inplace=True)

##############################################################################
# Add exchange rates from Todd
##############################################################################

exchangeRates = pd.read_csv(inputPath + "Exchange_rates_2022.csv")

exchangeRates.rename(columns={'year':'yearOfSalary'},inplace=True)
exchangeRates.rename(columns={'country_glassdoor':'countryName'},inplace=True)
exchangeRates.rename(columns={'iso':'country_iso'},inplace=True)

exchangeRates['fill2022'] = exchangeRates.groupby('country').gdppw.shift(1) 

exchangeRates.loc[exchangeRates.yearOfSalary==2022,'gdppw'] = exchangeRates[exchangeRates.yearOfSalary==2022]['fill2022']

del exchangeRates['fill2022']
del exchangeRates['country']

df = pd.merge(df, exchangeRates, how='left', left_on = ['countryName','yearOfSalary'], right_on = ['countryName','yearOfSalary'])

##############################################################################
# Add country names to abbreviations
##############################################################################

print("Read in country names.")

countryID = pd.read_csv(inputPath + "Country_identifiers.csv",encoding='cp1252')

countryID['Alpha2'] = countryID.Alpha2.apply(lambda x: str(x).lower())

# Clean up country names for alignment with Glassdoor
countryID['universityCountry'] = countryID['Country']
countryID.loc[countryID.Country=='Iran (Islamic Republic of)','universityCountry'] = 'Iran'
countryID.loc[countryID.Country=='Netherlands (the)','universityCountry'] = 'Netherlands'
countryID.loc[countryID.Country=='Philippines (the)','universityCountry'] = 'Philippines'
countryID.loc[countryID.Country=='Russian Federation (the)','universityCountry'] = 'Russia'
countryID.loc[countryID.Country=="Korea (the Republic of)",'universityCountry'] = 'South Korea'
countryID.loc[countryID.Country=='Taiwan (Province of China)','universityCountry'] = 'Taiwan'
countryID.loc[countryID.Country=='United Arab Emirates (the)','universityCountry'] = 'United Arab Emirates'
countryID.loc[countryID.Country=='United Kingdom of Great Britain and Northern Ireland (the)','universityCountry'] = 'United Kingdom'
countryID.loc[countryID.Country=='United States of America (the)','universityCountry'] = 'United States'
countryID.loc[countryID.Country=='Viet Nam','universityCountry'] = 'Vietnam'
countryID.loc[countryID.Country=='Czechia','universityCountry'] = 'Czech Republic'
countryID.loc[countryID.Country=='Venezuela (Bolivarian Republic of)','universityCountry'] = 'Venezuela'
countryID.loc[countryID.Country=='Palestine, State of','universityCountry'] = 'Palestine'
countryID.loc[countryID.Country=='Dominican Republic (the)','universityCountry'] = 'Dominican Republic'
countryID.loc[countryID.Country=='Tanzania, United Republic of','universityCountry'] = 'Tanzania'

countryID = countryID[['universityCountry','Alpha2']]

countryID.rename(columns={'Alpha2':'Nation'},inplace=True)

df = pd.merge(df, countryID, how='left', left_on = ['Nation'], right_on = ['Nation'])

countryID.rename(columns={'Nation':'Nation_2'},inplace=True)
countryID.rename(columns={'universityCountry':'universityCountry_2'},inplace=True)

df = pd.merge(df, countryID, how='left', left_on = ['Nation_2'], right_on = ['Nation_2'])

##############################################################################
# Calculate share of workers with resumes (resumes only pulled up until 2021) then drop everyone else
##############################################################################

df['shareWithResume'] = df[df.yearOfSalary<=2021].groupby('FK_userId').head(1).FK_resumeId.notnull().mean()

#dfTrim = df[(df.workerCountries>1) | (df.FK_resumeId.notnull())]
#df = df[df.FK_resumeId.notnull()]

# Save with missing
df = df[(df.degree=='missing') | (df.degree=='BACHELORS') | (df.degree=='MASTERS') | (df.degree=='JD') | (df.degree=='MBA') | (df.degree=='PHD') | (df.degree=='POSTGRAD')]
df.to_csv(savePath + 'Salaries_international_dataset.csv')

# Drop missing degree
df = df[(df.degree=='BACHELORS') | (df.degree=='MASTERS') | (df.degree=='JD') | (df.degree=='MBA') | (df.degree=='PHD') | (df.degree=='POSTGRAD')]

##############################################################################
# Drop if worker has both genders for different salaries
##############################################################################

df['isFemale'] = 1 * (df.gender == 'FEMALE')
isFemale = pd.DataFrame(df.groupby(['FK_userId']).isFemale.max()).reset_index()

df['isMale'] = 1 * (df.gender == 'MALE')
isMale = pd.DataFrame(df.groupby(['FK_userId']).isMale.max()).reset_index()

df['isUnknown'] = 1 * (df.gender == 'UNKNOWN')
isUnknown = pd.DataFrame(df.groupby(['FK_userId']).isUnknown.max()).reset_index()

del df['isFemale']
del df['isMale']
del df['isUnknown']

df = pd.merge(df, isFemale, how='left', left_on = ['FK_userId'], right_on = ['FK_userId'])
df = pd.merge(df, isMale, how='left', left_on = ['FK_userId'], right_on = ['FK_userId'])
df = pd.merge(df, isUnknown, how='left', left_on = ['FK_userId'], right_on = ['FK_userId'])

df = df[~ ( (df.isMale==1) & (df.isFemale==1) )]

df.loc[(df.isFemale==1) & (df.isUnknown==1),'gender'] = 'FEMALE'
df.loc[(df.isMale==1) & (df.isUnknown==1),'gender'] = 'MALE'

del df['isFemale']
del df['isMale']
del df['isUnknown']

userGenders = pd.DataFrame(df.groupby(['FK_userId','gender']).head(1).groupby(['FK_userId']).size()).reset_index()
userGenders.rename(columns={0:'userGenders'},inplace=True)
userGenders = userGenders[userGenders['userGenders']>1]

del isFemale
del isMale
del isUnknown
del userGenders

##############################################################################
# Merge in CWUR rankings
##############################################################################

#-----------------------------
# Read in CWUR rankings
#-----------------------------

print("Read in cwur rankings.")

cwurNames = pd.read_csv(inputPath + "CWUR_world_rankings_2019_2020.csv",encoding='cp1252')

# Clean up country names for alignment with Glassdoor
cwurNames.loc[cwurNames.Location=='USA','Location'] = 'United States'
cwurNames.loc[cwurNames.Location=='Slovak Republic','Location'] = 'Slovakia'

cwurNames.rename(columns={'Institution':'school'},inplace=True)
cwurNames.rename(columns={'Location':'universityCountry'},inplace=True)

cwurNames = cwurNames[cwurNames.school.notnull()]

#-----------------------------
# Determine matches and non-matches
#-----------------------------

# Determine perfect matches to rankings
schoolNames = df.groupby(['universityCountry','school']).size().reset_index()

cwurNames = pd.merge(cwurNames, schoolNames, how='left', left_on = ['universityCountry','school'], right_on = ['universityCountry','school'])

matchedCWUR = cwurNames[cwurNames[0].notnull()]
unmatchedCWUR = cwurNames[cwurNames[0].isnull()]

# Determine unmatched universities
unmatchedGD = pd.merge(schoolNames, matchedCWUR, how='left', left_on = ['universityCountry','school',0], right_on = ['universityCountry','school',0])
unmatchedGD = unmatchedGD[unmatchedGD.Score.isnull()]

#-----------------------------
# Fuzzymatch unmatched rankings
#-----------------------------

newMatches = unmatchedCWUR['school'].apply(lambda x: process.extractOne(x,unmatchedGD.school,scorer=fuzz.token_sort_ratio))
newMatches = newMatches.reset_index()

nextName = [newMatches.school[x][0] for x in range(0,len(newMatches))]
nextScore = [int(newMatches.school[x][1]) for x in range(0,len(newMatches))]

nextResults = pd.DataFrame(np.stack((unmatchedCWUR.school,nextName,nextScore))).T
nextResults.rename(columns={0:'unmatched'},inplace=True)
nextResults.rename(columns={1:'match'},inplace=True)
nextResults.rename(columns={2:'score'},inplace=True)
backupResults = nextResults.copy()        

# Keep matches
nextResults['keepMatch'] = 0
nextResults.loc[nextResults.score==100,'keepMatch'] = 1
nextResults.loc[nextResults.score==99,'keepMatch'] = 1
nextResults.loc[nextResults.score==98,'keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Johannes Gutenberg University Mainz','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Laval University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Paris 13 University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Duy Tân University','keepMatch'] = 1
nextResults.loc[nextResults.score==96,'keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Hebei Agricultural University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Texas at Austin','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of British Columbia','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of New South Wales','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Western Australia','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Chinese University of Hong Kong','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Hong Kong Polytechnic University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Texas at Dallas','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Texas at Arlington','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Texas Medical Branch','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Puerto Rico at Mayagüez','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Erasmus University Rotterdam','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Colorado Anschutz Medical Campus','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='San Francisco de Quito University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Franklin & Marshall College','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='American University in Cairo','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Edinburgh','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Melbourne','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Uppsala University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Queensland','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Nottingham','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Sheffield','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Hong Kong','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Adelaide','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Aalborg University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Newcastle','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Stellenbosch University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='College of William & Mary','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='China Medical University, PRC','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Birla Institute of Technology and Science, Pilani','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Winnipeg','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Sydney','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Aarhus University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Maastricht University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Ohio University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Montana','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='SUNY Upstate Medical University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pablo de Olavide University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Koblenz-Landau','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='College of Staten Island','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Hashemite University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Tokyo','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Washington - Seattle','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Stockholm University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Hull','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Istanbul Bilgi University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Laval University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=="St. John's University",'keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Puerto Rico, Río Piedras Campus','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Strasbourg','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Radboud University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Oklahoma, Norman','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pompeu Fabra University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of the West of England, Bristol','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Free University of Berlin','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Eduardo Mondlane University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Lund University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Stuttgart','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Babe?-Bolyai University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Mannheim','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Dokuz Eylül University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Sapienza University of Rome','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Hamburg','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Istanbul University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of the West Indies','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Ludwig Maximilian University of Munich','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Humboldt University of Berlin','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Soochow University, China','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Montana State University - Bozeman','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Brooklyn College','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Central University of Venezuela','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Federal University of Rio Grande do Sul','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Télécom ParisTech','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Y?ld?z Technical University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Alexandru Ioan Cuza University of Ia?i','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Sanjay Gandhi Postgraduate Institute of Medical Sciences','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Federal University of Rio de Janeiro','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Technical University of Dortmund','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Québec at Montreal','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Embry-Riddle Aeronautical University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Cheikh Anta Diop University of Dakar','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pennsylvania State University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Zurich','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Complutense University of Madrid','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Carlos III University of Madrid','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Jacobs University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pontifical Catholic University of Chile','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pontifical Catholic University of Rio de Janeiro','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University Paris-Est Créteil','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Charles University in Prague','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='École centrale de Lille','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Arkansas - Fayetteville','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Hong Kong Polytechnic University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='TU Ilmenau','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='China University of Geosciences','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Santiago de Compostela','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of North Texas, Denton','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Valladolid','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Federal University of Pernambuco','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pontifical Catholic University of Rio Grande do Sul','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Istanbul Ayd?n University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=="Ca' Foscari University of Venice",'keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Autonomous University of Barcelona','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Tennessee, Knoxville','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Texas at San Antonio','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Texas at El Paso','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Ulm University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Clermont Auvergne','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Western Cape','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='British University in Egypt','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Warwick','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Getúlio Vargas Foundation','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Guilan','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Fluminense Federal University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Antonio Nariño University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='K. N. Toosi University of Technology','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pontifical Javeriana University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Comillas Pontifical University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Bordeaux','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Poitiers','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Indian Institute of Technology (BHU) Varanasi','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Montpellier','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Technical University of Braunschweig','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Federal University of Rio Grande do Norte','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Federal Rural University of Rio de Janeiro','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='National Polytechnic Institute','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Metropolitan Autonomous University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Industrial University of Santander','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Benemérita Autonomous University of Puebla','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pontifical Catholic University of Paraná','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University POLITEHNICA of Bucharest','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Indian Institute of Technology (Indian School of Mines), Dhanbad','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University of Modena and Reggio Emilia','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Education University of Hong Kong','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Institute of Chemical Technology, Mumbai','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Indian Statistical Institute, Kolkata','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Shandong First Medical University & Shandong Academy of Medical Sciences','keepMatch'] = 1

nextResults = nextResults[nextResults.keepMatch==1]
del nextResults['keepMatch']

#-----------------------------
# Merge unmatched matches with fuzzymatch and add to matches
#-----------------------------

addSchools = pd.merge(unmatchedCWUR, nextResults, how='left', left_on = ['school'], right_on = ['unmatched'])
del addSchools['unmatched']
del addSchools[0]
del addSchools['score']
del addSchools['school']

addSchools.rename(columns={'match':'school'},inplace=True)
addSchools = addSchools[addSchools.school.notnull()]

# Combined all matched universities
addSchools = pd.merge(addSchools, schoolNames, how='left', left_on = ['universityCountry','school'], right_on = ['universityCountry','school'])
addSchools = addSchools[addSchools[0].notnull()]

matchedCWUR = matchedCWUR.append(addSchools)

# Determine still unmatched
unmatchedCWUR = pd.merge(cwurNames, matchedCWUR, how='left', left_on = ['World Rank'], right_on = ['World Rank'])
unmatchedCWUR = unmatchedCWUR[unmatchedCWUR['0_x'].isnull() & unmatchedCWUR['0_y'].isnull()]

#-----------------------------
# Manually add university merges by altering cwur ranking names
#-----------------------------

cwurNames.loc[cwurNames.school=='Columbia University','school'] = 'Columbia University in the City of New York'
cwurNames.loc[cwurNames.school=='University of Paris','school'] = 'Université de Paris'
cwurNames.loc[cwurNames.school=='Purdue University','school'] = 'Purdue University-Main Campus'
cwurNames.loc[cwurNames.school=='Ohio State University','school'] = 'Ohio State University-Main Campus'
cwurNames.loc[cwurNames.school=='University of Virginia','school'] = 'University of Virginia-Main Campus'
cwurNames.loc[cwurNames.school=='University of Pittsburgh','school'] = 'University of Pittsburgh-Pittsburgh Campus'
cwurNames.loc[cwurNames.school=='Georgia Institute of Technology','school'] = 'Georgia Institute of Technology-Main Campus'
cwurNames.loc[cwurNames.school=='University of Amsterdam','school'] = 'Universiteit van Amsterdam'
cwurNames.loc[cwurNames.school=='University of Geneva','school'] = 'Université de Genève'
cwurNames.loc[cwurNames.school=='KU Leuven','school'] = 'Katholieke Universiteit Leuven'
cwurNames.loc[cwurNames.school=='VU Amsterdam','school'] = 'Vrije Universiteit Amsterdam'
cwurNames.loc[cwurNames.school=='Arizona State University','school'] = 'Arizona State University-Tempe'
cwurNames.loc[cwurNames.school=='North Carolina State University','school'] = 'North Carolina State University at Raleigh'
cwurNames.loc[cwurNames.school=='Chinese University of Hong Kong','school'] = 'The Chinese University of Hong Kong'
cwurNames.loc[cwurNames.school=='Lomonosov Moscow State University','school'] = 'Moscow State University'
cwurNames.loc[cwurNames.school=='University of Vienna','school'] = 'Universität Wien'
cwurNames.loc[cwurNames.school=='University of Erlangen–Nuremberg','school'] = 'Friedrich-Alexander-Universität Erlangen-Nürnberg'
cwurNames.loc[cwurNames.school=='University of Cincinnati','school'] = 'University of Cincinnati-Main Campus'
cwurNames.loc[cwurNames.school=='Hong Kong University of Science and Technology','school'] = 'The Hong Kong University of Science and Technology'
cwurNames.loc[cwurNames.school=='Colorado State University','school'] = 'Colorado State University-Fort Collins'
cwurNames.loc[cwurNames.school=='Virginia Tech','school'] = 'Virginia Polytechnic Institute and State University'
cwurNames.loc[cwurNames.school=='Autonomous University of Madrid','school'] = 'Universidad Autónoma de Madrid'
cwurNames.loc[cwurNames.school=='Hong Kong Polytechnic University','school'] = 'The Hong Kong Polytechnic University'
cwurNames.loc[cwurNames.school=='University of South Florida','school'] = 'University of South Florida-St Petersburg'
cwurNames.loc[cwurNames.school=='Louisiana State University','school'] = 'Louisiana State University and Agricultural & Mechanical College'
cwurNames.loc[cwurNames.school=='University of New Mexico','school'] = 'University of New Mexico-Main Campus'
cwurNames.loc[cwurNames.school=='Chinese Academy of Medical Sciences & Peking Union Medical College','school'] = 'Peking Union Medical College'
cwurNames.loc[cwurNames.school=='Polytechnic University of Milan','school'] = 'Politecnico di Milano'
cwurNames.loc[cwurNames.school=='City College of New York','school'] = 'CUNY City College'
cwurNames.loc[cwurNames.school=='University at Albany, SUNY','school'] = 'SUNY at Albany'
cwurNames.loc[cwurNames.school=='University of Malaya','school'] = 'Universiti Malaya'
cwurNames.loc[cwurNames.school=='University of Colorado Denver','school'] = 'University of Colorado Denver/Anschutz Medical Campus'
cwurNames.loc[cwurNames.school=='Tulane University','school'] = 'Tulane University of Louisiana'
cwurNames.loc[cwurNames.school=='Catholic University of the Sacred Heart','school'] = 'Università Cattolica del Sacro Cuore'
cwurNames.loc[cwurNames.school=='TU Darmstadt','school'] = 'Technische Universität Darmstadt'
cwurNames.loc[cwurNames.school=='Oklahoma State University','school'] = 'Oklahoma State University-Main Campus'
cwurNames.loc[cwurNames.school=='TU Dortmund','school'] = 'Technische Universität Dortmund'
cwurNames.loc[cwurNames.school=='NOVA University of Lisbon','school'] = 'Universidade Nova de Lisboa'
cwurNames.loc[cwurNames.school=='Polytechnic University of Madrid','school'] = 'Universidad Politécnica de Madrid'
cwurNames.loc[cwurNames.school=='Brigham Young University','school'] = 'Brigham Young University-Provo'
cwurNames.loc[cwurNames.school=='Catholic University of Korea','school'] = 'The Catholic University of Korea'
cwurNames.loc[cwurNames.school=='University of New Hampshire','school'] = 'University of New Hampshire-Main Campus'
cwurNames.loc[cwurNames.school=='Bo?aziçi University','school'] = 'Bogazici University'
cwurNames.loc[cwurNames.school=='University of Salamanca','school'] = 'Universidad de Salamanca'
cwurNames.loc[cwurNames.school=='Sorbonne Paris Nord University','school'] = 'Université Paris 13'
cwurNames.loc[cwurNames.school=='Kent State University','school'] = 'Kent State University at Kent'
cwurNames.loc[cwurNames.school=='Centrale Lille Institut','school'] = 'Centrale Lille'
cwurNames.loc[cwurNames.school=='University of Wroc?aw','school'] = 'Uniwersytet Wroclawski'
cwurNames.loc[cwurNames.school=='New Mexico State University','school'] = 'New Mexico State University-Main Campus'
cwurNames.loc[cwurNames.school=='TU Kaiserslautern','school'] = 'Technische Universität Kaiserslautern'
cwurNames.loc[cwurNames.school=='Sofia University','school'] = 'Sofia University "St. Kliment Ohridski"'
cwurNames.loc[cwurNames.school=='COMSATS University Islamabad','school'] = 'COMSATS Institute of Information Technology'
cwurNames.loc[cwurNames.school=='University of Akron','school'] = 'University of Akron Main Campus'
cwurNames.loc[cwurNames.school=='North Dakota State University','school'] = 'North Dakota State University-Main Campus'
cwurNames.loc[cwurNames.school=='Hunter College','school'] = 'CUNY Hunter College'
cwurNames.loc[cwurNames.school=='Wright State University','school'] = 'Wright State University-Main Campus'
cwurNames.loc[cwurNames.school=='Bowling Green State University','school'] = 'Bowling Green State University-Main Campus'
cwurNames.loc[cwurNames.school=='University of Gda?sk','school'] = 'Uniwersytet Gdanski'
cwurNames.loc[cwurNames.school=='Uluda? University','school'] = 'Uludag Üniversitesi'
cwurNames.loc[cwurNames.school=='TU Ilmenau','school'] = 'Technische Universität Ilmenau'
cwurNames.loc[cwurNames.school=='Sabanc? University','school'] = 'Sabanci Üniversitesi'
cwurNames.loc[cwurNames.school=='MARA University of Technology','school'] = 'Universiti Teknologi MARA'
cwurNames.loc[cwurNames.school=='Ba?kent University','school'] = 'Baskent Üniversitesi'
cwurNames.loc[cwurNames.school=='Bahçe?ehir University','school'] = 'Bahçesehir Üniversitesi'
cwurNames.loc[cwurNames.school=='Lyon 2 University','school'] = 'Université Lumière Lyon 2'
cwurNames.loc[cwurNames.school=='University of Hong Kong','school'] = 'The University of Hong Kong'
cwurNames.loc[cwurNames.school=='Miami University','school'] = 'Miami University-Oxford'
cwurNames.loc[cwurNames.school=='PETRONAS University of Technology','school'] = 'Universiti Teknologi Petronas'
cwurNames.loc[cwurNames.school=='Queens College, City University of New York','school'] = 'CUNY Queens College'
cwurNames.loc[cwurNames.school=='Autonomous University of Barcelona','school'] = 'Universidad Autónoma de Barcelona'
cwurNames.loc[cwurNames.school=='Pace University','school'] = 'Pace University-New York'
cwurNames.loc[cwurNames.school=='Sofia University "St. Kliment Ohridski"','school'] = 'Sofia University St. Kliment Ohridski'
cwurNames.loc[cwurNames.school=='University of Indonesia','school'] = 'Universitas Indonesia'
cwurNames.loc[cwurNames.school=='Indian Institute of Technology (BHU) Varanasi','school'] = 'Indian Institute of Technology, BHU'
cwurNames.loc[cwurNames.school=='TU Dresden','school'] = 'Technische Universität Dresden'
cwurNames.loc[cwurNames.school=='University of Valencia','school'] = 'Universidad de Valencia'
cwurNames.loc[cwurNames.school=='University of Parma','school'] = 'Università degli Studi di Parma'
cwurNames.loc[cwurNames.school=='Ulm University','school'] = 'Universität Ulm'
cwurNames.loc[cwurNames.school=='University of Pavia','school'] = 'Università degli Studi di Pavia'
cwurNames.loc[cwurNames.school=='University of Campinas','school'] = 'Universidade Estadual de Campinas'
cwurNames.loc[cwurNames.school=='Federal University of Minas Gerais','school'] = 'Universidade Federal de Minas Gerais'
cwurNames.loc[cwurNames.school=='University of Navarra','school'] = 'Universidad de Navarra'
cwurNames.loc[cwurNames.school=='University of Zaragoza','school'] = 'Universidad de Zaragoza'
cwurNames.loc[cwurNames.school=='Polytechnic University of Valencia','school'] = 'Universidad Politécnica de València'
cwurNames.loc[cwurNames.school=='University of Trieste','school'] = 'Università degli Studi di Trieste'
cwurNames.loc[cwurNames.school=='University of Trento','school'] = 'Università degli Studi di Trento'
cwurNames.loc[cwurNames.school=='University of Clermont Auvergne','school'] = 'Université Clermont Auvergne'
cwurNames.loc[cwurNames.school=='University of Bordeaux','school'] = 'Université de Bordeaux'
cwurNames.loc[cwurNames.school=='University of Oviedo','school'] = 'Universidad de Oviedo'
cwurNames.loc[cwurNames.school=='University of Salerno','school'] = 'Università degli Studi di Salerno'
cwurNames.loc[cwurNames.school=='Rio de Janeiro State University','school'] = 'Universidade do Estado do Rio de Janeiro'
cwurNames.loc[cwurNames.school=='National Polytechnic Institute','school'] = 'Instituto Politécnico Nacional'
cwurNames.loc[cwurNames.school=='Federal University of Santa Catarina','school'] = 'Universidade Federal de Santa Catarina'
cwurNames.loc[cwurNames.school=='University of Málaga','school'] = 'Universidad de Málaga'
cwurNames.loc[cwurNames.school=='University of Science, Malaysia','school'] = 'Universiti Sains Malaysia'
cwurNames.loc[cwurNames.school=='Getúlio Vargas Foundation','school'] = 'Fundação Getúlio Vargas'
cwurNames.loc[cwurNames.school=='University of Murcia','school'] = 'Universidad de Murcia'
cwurNames.loc[cwurNames.school=='Federal University of Paraná','school'] = 'Universidade Federal do Paraná'
cwurNames.loc[cwurNames.school=='National University of Malaysia, UKM','school'] = 'Universiti Kebangsaan Malaysia'
cwurNames.loc[cwurNames.school=='University of Alicante','school'] = 'Universidad de Alicante'
cwurNames.loc[cwurNames.school=='Federal University of Ceará','school'] = 'Universidade Federal do Ceará'
cwurNames.loc[cwurNames.school=='Federal University of Bahia','school'] = 'Universidade Federal da Bahia'
cwurNames.loc[cwurNames.school=='Federal University of São Carlos','school'] = 'Universidade Federal de São Carlos'
cwurNames.loc[cwurNames.school=='Wroc?aw University of Science and Technology','school'] = 'Politechnika Wroclawska'
cwurNames.loc[cwurNames.school=='University of Technology, Malaysia','school'] = 'Universiti Teknologi Malaysia'
cwurNames.loc[cwurNames.school=='King Juan Carlos University','school'] = 'Universidad Rey Juan Carlos'
cwurNames.loc[cwurNames.school=='University of Cádiz','school'] = 'Universidad de Cádiz'
cwurNames.loc[cwurNames.school=='University of Puerto Rico-Mayaguez','school'] = 'University of Puerto Rico-Mayaguez'
cwurNames.loc[cwurNames.school=='University of Siena','school'] = 'Università degli Studi di Siena'
cwurNames.loc[cwurNames.school=='?zmir Institute of Technology','school'] = 'Izmir Yüksek Teknoloji Enstitüsü'
cwurNames.loc[cwurNames.school=='Cz?stochowa University of Technology','school'] = 'Politechnika Czestochowska'
cwurNames.loc[cwurNames.school=='Minya University','school'] = 'Minia University'
cwurNames.loc[cwurNames.school=='University of Veracruz','school'] = 'Universidad Veracruzana'
cwurNames.loc[cwurNames.school=='University of Rzeszów','school'] = 'Uniwersytet Rzeszowski'
cwurNames.loc[cwurNames.school=='Catholic University of Portugal','school'] = 'Universidade Católica Portuguesa'
cwurNames.loc[cwurNames.school=='Antonio Nariño University','school'] = 'Universidad Antonio Nariño'
cwurNames.loc[cwurNames.school=='Industrial University of Santander','school'] = 'Universidad Industrial de Santander'
cwurNames.loc[cwurNames.school=='Vienna University of Economics and Business','school'] = 'Wirtschaftsuniversität Wien'
cwurNames.loc[cwurNames.school=='Comillas Pontifical University','school'] = 'Universidad Pontificia Comillas'
cwurNames.loc[cwurNames.school=='University of Modena and Reggio Emilia','school'] = 'Università Degli Studi Di Modena E Reggio Emilia'
cwurNames.loc[cwurNames.school=='Gda?sk University of Technology','school'] = 'Politechnika Gdanska'
cwurNames.loc[cwurNames.school=='Tallinn University of Technology','school'] = 'Tallinna Tehnikaülikool'
cwurNames.loc[cwurNames.school=='University of Tartu','school'] = 'Tartu Ülikool'
cwurNames.loc[cwurNames.school=='Monterrey Institute of Technology and Higher Education','school'] = 'Tecnológico de Monterrey'
cwurNames.loc[cwurNames.school=='West University of Timi?oara','school'] = 'West University of Timisoara'
cwurNames.loc[cwurNames.school=='Polytechnic University of Timi?oara','school'] = 'The Polytechnic University of Timisoara'
cwurNames.loc[cwurNames.school=='Pontifical Catholic University of Paraná','school'] = 'Pontifícia Universidade Católica do Paraná'
cwurNames.loc[cwurNames.school=='Pontifical Javeriana University','school'] = 'Pontificia Universidad Javeriana'
cwurNames.loc[cwurNames.school=='University of Havana','school'] = 'Universidad de La Habana'
cwurNames.loc[cwurNames.school=='Hamburg University of Technology','school'] = 'Technische Universität Hamburg'
cwurNames.loc[cwurNames.school=='Indian Institute of Technology (Indian School of Mines), Dhanbad','school'] = 'Indian Institute of Technology Dhanbad'
cwurNames.loc[cwurNames.school=='Pontifical Catholic University of Peru','school'] = 'Pontificia Universidad Católica del Perú'
cwurNames.loc[cwurNames.school=='University of Montpellier','school'] = 'Université de Montpellier'
cwurNames.loc[cwurNames.school=='University of Warwick','school'] = 'The University of Warwick'
cwurNames.loc[cwurNames.school=='University of Tennessee, Knoxville','school'] = 'The University of Tennessee-Knoxville'
cwurNames.loc[cwurNames.school=='University of Lorraine','school'] = 'Université de Lorraine'
cwurNames.loc[cwurNames.school=='Vienna University of Technology','school'] = 'Technische Universität Wien'
cwurNames.loc[cwurNames.school=='National University of La Plata','school'] = 'Universidad Nacional de La Plata'
cwurNames.loc[cwurNames.school=='University of Texas at San Antonio','school'] = 'The University of Texas at San Antonio'
cwurNames.loc[cwurNames.school=='Graz University of Technology','school'] = 'Technische Universität Graz'
cwurNames.loc[cwurNames.school=='National University of Córdoba','school'] = 'Universidad Nacional de Córdoba'
cwurNames.loc[cwurNames.school=='University of Poitiers','school'] = 'Université de Poitiers'
cwurNames.loc[cwurNames.school=='University POLITEHNICA of Bucharest','school'] = 'Universitatea Politehnica din Bucuresti'
cwurNames.loc[cwurNames.school=='Fluminense Federal University','school'] = 'Universidade Federal Fluminense'
cwurNames.loc[cwurNames.school=='Metropolitan Autonomous University','school'] = 'Universidad Autónoma Metropolitana'
cwurNames.loc[cwurNames.school=='University of Texas at El Paso','school'] = 'The University of Texas at El Paso'
cwurNames.loc[cwurNames.school=='University of Western Cape','school'] = 'University of the Western Cape'
cwurNames.loc[cwurNames.school=='National University of Rosario','school'] = 'Universidad Nacional de Rosario'
cwurNames.loc[cwurNames.school=='University of Valle','school'] = 'Universidad del Valle'
cwurNames.loc[cwurNames.school=='Mohamed First University Oujda','school'] = 'Université Mohammed Premier'

# 2022 additions
cwurNames.loc[cwurNames.school=='National University of the South','school'] = 'Universidad Nacional del Sur'
cwurNames.loc[cwurNames.school=='University of Veterinary Medicine, Vienna','school'] = 'Veterinärmedizinische Universität Wien'
cwurNames.loc[cwurNames.school=='Federal University of ABC','school'] = 'Universidade Federal do ABC'
cwurNames.loc[cwurNames.school=='University of Veterinary Medicine Hanover','school'] = 'Tierärztliche Hochschule Hannover'
cwurNames.loc[cwurNames.school=='Freiberg University of Mining and Technology','school'] = 'Technische Universität Bergakademie Freiberg'
cwurNames.loc[cwurNames.school=='University of Guanajuato','school'] = 'Universidad de Guanajuato'
cwurNames.loc[cwurNames.school=='University of Fribourg','school'] = 'Université de Fribourg'
cwurNames.loc[cwurNames.school=='University of the Republic, Uruguay','school'] = 'Universidad de la República'

#-----------------------------
# Manually add university merges by altering cwur universities conditional on country
#-----------------------------

cwurNames.loc[(cwurNames.school=='University of Palermo') & (cwurNames.universityCountry=='Italy'),'school'] = 'Università degli Studi di Palermo'
cwurNames.loc[(cwurNames.school=='Heidelberg University') & (cwurNames.universityCountry=='Germany'),'school'] = 'Ruprecht-Karls-Universität Heidelberg'
cwurNames.loc[(cwurNames.school=='Lincoln University') & (cwurNames.universityCountry=='New Zealand'),'school'] = 'Lincoln University, New Zealand'

#-----------------------------
# Manually add university merges by altering cwur universities
#-----------------------------

cwurNames.loc[cwurNames.school=='The University of Hong Kong','universityCountry'] = 'Hong Kong'
cwurNames.loc[cwurNames.school=='The Chinese University of Hong Kong','universityCountry'] = 'Hong Kong'
cwurNames.loc[cwurNames.school=='The Hong Kong University of Science and Technology','universityCountry'] = 'Hong Kong'
cwurNames.loc[cwurNames.school=='City University of Hong Kong','universityCountry'] = 'Hong Kong'
cwurNames.loc[cwurNames.school=='The Hong Kong Polytechnic University','universityCountry'] = 'Hong Kong'
cwurNames.loc[cwurNames.school=='Hong Kong Baptist University','universityCountry'] = 'Hong Kong'

df.loc[df.school=='University of Puerto Rico-Mayaguez','universityCountry'] = 'Puerto Rico'

#-----------------------------
# Merge remaining unmatched with newly changed
#-----------------------------

del cwurNames[0]

matchedCWURNames = pd.merge(cwurNames, schoolNames, how='left', left_on = ['universityCountry','school'], right_on = ['universityCountry','school'])

matchedCWURNames = matchedCWURNames[matchedCWURNames[0].notnull()]

matchedCWUR = matchedCWUR.append(matchedCWURNames)

matchedCWUR = matchedCWUR.groupby(['World Rank']).head(1)

# Determine still unmatched
unmatchedCWUR = pd.merge(cwurNames, matchedCWUR, how='left', left_on = ['World Rank'], right_on = ['World Rank'])
unmatchedCWUR = unmatchedCWUR[unmatchedCWUR[0].isnull()]

# Determine unmatched universities
unmatchedGD = pd.merge(schoolNames, matchedCWUR, how='left', left_on = ['universityCountry','school',0], right_on = ['universityCountry','school',0])
unmatchedGD = unmatchedGD[unmatchedGD.Score.isnull()]

#-----------------------------
# Keep all matches
#-----------------------------

matchedCWUR['world_rank'] = matchedCWUR['World Rank'].apply(lambda x: int(x))

matchedCWUR['national_rank'] = matchedCWUR['National Rank'].apply(lambda x: int(x))

matchedCWUR = matchedCWUR.sort_values('world_rank')

del matchedCWUR['World Rank']
del matchedCWUR['National Rank']
del matchedCWUR[0]

#-----------------------------
# Clean up ranking dataset
#-----------------------------

matchedCWUR['quality_of_education'] = matchedCWUR['Quality\xa0of Education']
matchedCWUR.loc[matchedCWUR.quality_of_education=='-','quality_of_education'] = np.nan
matchedCWUR['quality_of_education'] = matchedCWUR['quality_of_education'].apply(lambda x: float(x))
del matchedCWUR['Quality\xa0of Education']

matchedCWUR['quality_of_faculty'] = matchedCWUR['Quality\xa0of Faculty']
matchedCWUR.loc[matchedCWUR.quality_of_faculty=='-','quality_of_faculty'] = np.nan
matchedCWUR['quality_of_faculty'] = matchedCWUR['quality_of_faculty'].apply(lambda x: float(x))
del matchedCWUR['Quality\xa0of Faculty']

matchedCWUR['alumni_employment'] = matchedCWUR['Alumni Employment']
matchedCWUR.loc[matchedCWUR.alumni_employment=='-','alumni_employment'] = np.nan
matchedCWUR['alumni_employment'] = matchedCWUR['alumni_employment'].apply(lambda x: float(x))
del matchedCWUR['Alumni Employment']

matchedCWUR['research_performance'] = matchedCWUR['Research Performance']
matchedCWUR.loc[matchedCWUR.research_performance=='-','research_performance'] = np.nan
matchedCWUR['research_performance'] = matchedCWUR['research_performance'].apply(lambda x: float(x))
del matchedCWUR['Research Performance']

#-----------------------------
# Merge rankings and determine which still unmatched
#-----------------------------

df = pd.merge(df, matchedCWUR, how='left', left_on = ['universityCountry','school'], right_on = ['universityCountry','school'])

hasWorldRank = df[df.world_rank.notnull()].groupby('world_rank').size().reset_index()

cwurNames['world_rank'] = cwurNames['World Rank'].apply(lambda x: int(x))

stillUnmatched = pd.merge(cwurNames, hasWorldRank, how='left', left_on = ['world_rank'], right_on = ['world_rank'])

stillUnmatched = stillUnmatched[stillUnmatched[0].isnull()]

missingNation = df[df.school.notnull() & df.universityCountry.isnull()].groupby('school').size().reset_index() 

#-----------------------------
# Merge rankings for second school 
#-----------------------------

matchedCWUR_2 = matchedCWUR[['school','universityCountry','world_rank','national_rank']]

matchedCWUR_2.rename(columns={'school':'school_2'},inplace=True)
matchedCWUR_2.rename(columns={'universityCountry':'universityCountry_2'},inplace=True)
matchedCWUR_2.rename(columns={'world_rank':'world_rank_2'},inplace=True)
matchedCWUR_2.rename(columns={'national_rank':'national_rank_2'},inplace=True)

df = pd.merge(df, matchedCWUR_2, how='left', left_on = ['universityCountry_2','school_2'], right_on = ['universityCountry_2','school_2'])


##############################################################################
# Drop U.S. institutions that are not 4-year institutions
##############################################################################
    
print("Exclude U.S. institutions that are not 4-year schools.")

#-----------------------------
# Read in school info (alias, type, location)
#-----------------------------

schoolNames = pd.read_csv(inputPath + "IPEDS_data_schools.csv")

schoolInfo = pd.read_csv(inputPath + "IPEDS_add_schools_info.csv")

schoolLevel = schoolInfo[schoolInfo.VariableName=='Level of institution (HD2017)']
schoolLevel.rename(columns={'ValueLabel':'schoolLevel'},inplace=True)
del schoolLevel['VariableName']

schoolCBSA = schoolInfo[schoolInfo.VariableName=='Core Based Statistical Area (CBSA) (HD2017)']
schoolCBSA.rename(columns={'ValueLabel':'cbsa'},inplace=True)
del schoolCBSA['VariableName']

schoolNames = pd.merge(schoolNames, schoolLevel, how='left', left_on = ['Level of institution (HD2017)'], right_on = ['Value'])
schoolNames = pd.merge(schoolNames, schoolCBSA, how='left', left_on = ['Core Based Statistical Area (CBSA) (HD2017)'], right_on = ['Value'])

schoolNames.rename(columns={'Institution Name':'school'},inplace=True)
schoolNames.rename(columns={'Institution name alias (HD2017)':'alias'},inplace=True)

schoolNames = schoolNames[['school','alias','schoolLevel','cbsa']]

#-----------------------------
# Extract all aliases and keep only unique ones
#-----------------------------

noSplitName = np.array(schoolNames[schoolNames.alias.notnull() & schoolNames.alias.apply(lambda x: str(x).count('|')==0)].school)
noSplitAlias = np.array(schoolNames[schoolNames.alias.notnull() & schoolNames.alias.apply(lambda x: str(x).count('|')==0)].alias)

splitName = np.array(schoolNames[schoolNames.alias.apply(lambda x: str(x).count('|')>0)].school)
splitAlias = np.array(schoolNames[schoolNames.alias.apply(lambda x: str(x).count('|')>0)].alias.apply(lambda x: str(x).split('|')))

allAlias = pd.DataFrame([])

for ii in range(0,len(splitAlias)):
    for jj in range(0,len(splitAlias[ii])):
        
        nextAlias = pd.DataFrame([splitName[ii],str(splitAlias[ii][jj]).lstrip().rstrip()]).T

        allAlias = allAlias.append(nextAlias)

for ii in range(0,len(noSplitAlias)):

        nextAlias = pd.DataFrame([noSplitName[ii],str(noSplitAlias[ii]).lstrip().rstrip()]).T

        allAlias = allAlias.append(nextAlias)
        
allAlias.rename(columns={0:'school'},inplace=True)
allAlias.rename(columns={1:'alias'},inplace=True)

# Keep unique instances
allAlias = allAlias[allAlias.alias!=""]
allAlias = allAlias.groupby(['school','alias']).head(1)        
allAlias['aliasCount'] = allAlias.groupby('alias').cumcount()
aliasCount = pd.DataFrame(allAlias.groupby('alias').aliasCount.max()).reset_index()
allAlias = pd.merge(allAlias, aliasCount, how='left', left_on = ['alias'], right_on = ['alias'])
allAlias = allAlias[allAlias.aliasCount_y==0]
allAlias = allAlias[['school','alias']]

#-----------------------------
# Combine schools and aliases for mergeing cleanly
#-----------------------------

mergeInfo = schoolNames[['school','schoolLevel','cbsa']]
allAlias = pd.merge(allAlias, mergeInfo, how='left', left_on = ['school'], right_on = ['school'])
allAlias.rename(columns={'alias':'mergeSchool'},inplace=True)
allAlias = allAlias[['mergeSchool','school','cbsa','schoolLevel']]

schoolMerge = schoolNames.copy()
schoolMerge['mergeSchool'] = schoolMerge['school']
del schoolMerge['alias']
schoolMerge = schoolMerge[['mergeSchool','school','cbsa','schoolLevel']]

schoolMerge = schoolMerge.append(allAlias)
schoolMerge = schoolMerge.sort_values(['school','mergeSchool'])

del schoolCBSA
del schoolInfo
del schoolLevel
del schoolNames
del splitAlias
del splitName
del aliasCount
del allAlias
del mergeInfo
del noSplitAlias
del noSplitName
del nextAlias

check = schoolMerge[schoolMerge.school.apply(lambda x: str(x).count('-')>0)].school

#-----------------------------
# Remove Location distinction from the school name for merge
#-----------------------------

for repWord in ['Design','College','University','School','Arts','Institute','Academy','Technology']:
    schoolMerge.loc[schoolMerge.school.apply(lambda x: str(x).count(repWord + "-")>0),'trimSchool'] = schoolMerge[schoolMerge.school.apply(lambda x: str(x).count(repWord + "-")>0)].school.apply(lambda x: x.split(repWord + "-")[0] + repWord)

# Replace only for those where its not representing clearly different universities
schoolMerge.loc[schoolMerge.trimSchool=='Academy of Hair Design','mergeSchool'] = 'Academy of Hair Design'
schoolMerge.loc[schoolMerge.trimSchool=='Altierus Career College','mergeSchool'] = 'Altierus Career College'
schoolMerge.loc[schoolMerge.trimSchool=='All-State Career School','mergeSchool'] = 'All-State Career School'
schoolMerge.loc[schoolMerge.trimSchool=='American Career College','mergeSchool'] = 'Altierus Career College'
schoolMerge.loc[schoolMerge.trimSchool=='Antioch University','mergeSchool'] = 'Antioch University'
schoolMerge.loc[schoolMerge.trimSchool=='Argosy University','mergeSchool'] = 'Argosy University'
schoolMerge.loc[schoolMerge.trimSchool=='Aveda Institute','mergeSchool'] = 'Aveda Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Branford Hall Career Institute','mergeSchool'] = 'Branford Hall Career Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Brightwood Career Institute','mergeSchool'] = 'Brightwood Career Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Brightwood College','mergeSchool'] = 'Brightwood College'
schoolMerge.loc[schoolMerge.trimSchool=='Brown Mackie College','mergeSchool'] = 'Brown Mackie College'
schoolMerge.loc[schoolMerge.trimSchool=='Bryant & Stratton College','mergeSchool'] = 'Bryant & Stratton College'
schoolMerge.loc[schoolMerge.trimSchool=='Cannella School of Hair Design','mergeSchool'] = 'Cannella School of Hair Design'
schoolMerge.loc[schoolMerge.trimSchool=='Capri Institute of Hair Design','mergeSchool'] = 'Capri Institute of Hair Design'
schoolMerge.loc[schoolMerge.trimSchool=='Career Training Academy','mergeSchool'] = 'Career Training Academy'
schoolMerge.loc[schoolMerge.trimSchool=='Carrington College','mergeSchool'] = 'Carrington College'
schoolMerge.loc[schoolMerge.trimSchool=='Centura College','mergeSchool'] = 'Centura College'
schoolMerge.loc[schoolMerge.trimSchool=='Chamberlain University','mergeSchool'] = 'Chamberlain University'
schoolMerge.loc[schoolMerge.trimSchool=='Concorde Career College','mergeSchool'] = 'Concorde Career College'
schoolMerge.loc[schoolMerge.trimSchool=='Cortiva Institute','mergeSchool'] = 'Cortiva Institute'
schoolMerge.loc[schoolMerge.trimSchool=='DeVry University','mergeSchool'] = 'DeVry University'
schoolMerge.loc[schoolMerge.trimSchool=='Embry-Riddle Aeronautical University','mergeSchool'] = 'Embry-Riddle Aeronautical University'
schoolMerge.loc[schoolMerge.trimSchool=='Empire Beauty School','mergeSchool'] = 'Empire Beauty School'
schoolMerge.loc[schoolMerge.trimSchool=='Florida Career College','mergeSchool'] = 'Florida Career College'
schoolMerge.loc[schoolMerge.trimSchool=='Fortis College','mergeSchool'] = 'Fortis College'
schoolMerge.loc[schoolMerge.trimSchool=='Fortis Institute','mergeSchool'] = 'Fortis Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Herzing University','mergeSchool'] = 'Herzing University'
schoolMerge.loc[schoolMerge.trimSchool=='House of Heavilin Beauty College','mergeSchool'] = 'House of Heavilin Beauty College'
schoolMerge.loc[schoolMerge.trimSchool=='ITT Technical Institute','mergeSchool'] = 'ITT Technical Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Interactive College of Technology','mergeSchool'] = 'Interactive College of Technology'
schoolMerge.loc[schoolMerge.trimSchool=='Kaplan University','mergeSchool'] = 'Kaplan University'
schoolMerge.loc[schoolMerge.trimSchool=='Le Cordon Bleu College of Culinary Arts','mergeSchool'] = 'Le Cordon Bleu College of Culinary Arts'
schoolMerge.loc[schoolMerge.trimSchool=='Lincoln College of Technology','mergeSchool'] = 'Lincoln College of Technology'
schoolMerge.loc[schoolMerge.trimSchool=='Lincoln Technical Institute','mergeSchool'] = 'Lincoln Technical Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Milan Institute','mergeSchool'] = 'Milan Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Miller-Motte College','mergeSchool'] = 'Miller-Motte College'
schoolMerge.loc[schoolMerge.trimSchool=='Miller-Motte Technical College','mergeSchool'] = 'Miller-Motte Technical College'
schoolMerge.loc[schoolMerge.trimSchool=='National American University','mergeSchool'] = 'National American University'
schoolMerge.loc[schoolMerge.trimSchool=='Orange Technical College','mergeSchool'] = 'Orange Technical College'
schoolMerge.loc[schoolMerge.trimSchool=='Paul Mitchell the School','mergeSchool'] = 'Paul Mitchell the School'
schoolMerge.loc[schoolMerge.trimSchool=='Pima Medical Institute','mergeSchool'] = 'Pima Medical Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Platt College','mergeSchool'] = 'Platt College'
schoolMerge.loc[schoolMerge.trimSchool=='Remington College','mergeSchool'] = 'Remington College'
schoolMerge.loc[schoolMerge.trimSchool=='SAE Institute of Technology','mergeSchool'] = 'SAE Institute of Technology'
schoolMerge.loc[schoolMerge.trimSchool=='Salon Success Academy','mergeSchool'] = 'Salon Success Academy'
schoolMerge.loc[schoolMerge.trimSchool=='Sanford-Brown College','mergeSchool'] = 'Sanford-Brown College'
schoolMerge.loc[schoolMerge.trimSchool=='South Texas Vocational Technical Institute','mergeSchool'] = 'South Texas Vocational Technical Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Southern Careers Institute','mergeSchool'] = 'Southern Careers Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Strayer University','mergeSchool'] = 'Strayer University'
schoolMerge.loc[schoolMerge.trimSchool=='Summit Salon Academy','mergeSchool'] = 'Summit Salon Academy'
schoolMerge.loc[schoolMerge.trimSchool=='Tennessee College of Applied Technology','mergeSchool'] = 'Tennessee College of Applied Technology'
schoolMerge.loc[schoolMerge.trimSchool=='The Salon Professional Academy','mergeSchool'] = 'The Salon Professional Academy'
schoolMerge.loc[schoolMerge.trimSchool=='Toni & Guy Hairdressing Academy','mergeSchool'] = 'Toni & Guy Hairdressing Academy'
schoolMerge.loc[schoolMerge.trimSchool=='Tulsa Welding School','mergeSchool'] = 'Tulsa Welding School'
schoolMerge.loc[schoolMerge.trimSchool=='Ultimate Medical Academy','mergeSchool'] = 'Ultimate Medical Academy'
schoolMerge.loc[schoolMerge.trimSchool=='Unitech Training Academy','mergeSchool'] = 'Unitech Training Academy'
schoolMerge.loc[schoolMerge.trimSchool=='United Education Institute','mergeSchool'] = 'United Education Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Universal Technical Institute','mergeSchool'] = 'Universal Technical Institute'
schoolMerge.loc[schoolMerge.trimSchool=='Vatterott College','mergeSchool'] = 'Vatterott College'
schoolMerge.loc[schoolMerge.trimSchool=='Virginia College','mergeSchool'] = 'Virginia College'
schoolMerge.loc[schoolMerge.trimSchool=='Xenon International Academy','mergeSchool'] = 'Xenon International Academy'
schoolMerge.loc[schoolMerge.trimSchool=='YTI Career Institute','mergeSchool'] = 'YTI Career Institute'

schoolMerge = schoolMerge.groupby('mergeSchool').head(1)

del schoolMerge['trimSchool']

#-----------------------------
# Merge with main dataset and only keep institutions that can line up with the degrees offered
#-----------------------------

schoolMerge = schoolMerge[['school','schoolLevel']]

schoolMerge = schoolMerge.groupby('school').head(1)

# 4-year institution degrees 
df['fourYearDegree'] = 1 * ( (df.degree=='BACHELORS') | (df.degree=='JD') | (df.degree=='MASTERS') | (df.degree=='MBA') | (df.degree=='PHD') | (df.degree=='POSTGRAD'))
df['fourYearDegree2'] = 1 * ( (df.degree_2=='BACHELORS') | (df.degree_2=='JD') | (df.degree_2=='MASTERS') | (df.degree_2=='MBA') | (df.degree_2=='PHD') | (df.degree_2=='POSTGRAD'))

# First school
df = pd.merge(df, schoolMerge, how='left', left_on = ['school'], right_on = ['school'])

df.loc[(df.schoolLevel=='Less than 2 years (below associate)') & ((df.fourYearDegree==1) | (df.degree=='ASSOCIATES')),'school'] = np.nan
df.loc[(df.schoolLevel=='Less than 2 years (below associate)') & ((df.fourYearDegree==1) | (df.degree=='ASSOCIATES')),'Nation'] = np.nan
df.loc[(df.schoolLevel=='Less than 2 years (below associate)') & ((df.fourYearDegree==1) | (df.degree=='ASSOCIATES')),'universityCountry'] = np.nan

df.loc[(df.schoolLevel=='At least 2 but less than 4 years') & (df.fourYearDegree==1),'school'] = np.nan
df.loc[(df.schoolLevel=='At least 2 but less than 4 years') & (df.fourYearDegree==1),'Nation'] = np.nan
df.loc[(df.schoolLevel=='At least 2 but less than 4 years') & (df.fourYearDegree==1),'universityCountry'] = np.nan

del df['schoolLevel']

# Second school
schoolMerge.rename(columns={'school':'school_2'},inplace=True)

df = pd.merge(df, schoolMerge, how='left', left_on = ['school_2'], right_on = ['school_2'])

df.loc[(df.schoolLevel=='Less than 2 years (below associate)') & ((df.fourYearDegree2==1) | (df.degree_2=='ASSOCIATES')),'school_2'] = np.nan
df.loc[(df.schoolLevel=='Less than 2 years (below associate)') & ((df.fourYearDegree2==1) | (df.degree_2=='ASSOCIATES')),'Nation_2'] = np.nan
df.loc[(df.schoolLevel=='Less than 2 years (below associate)') & ((df.fourYearDegree2==1) | (df.degree_2=='ASSOCIATES')),'universityCountry_2'] = np.nan

df.loc[(df.schoolLevel=='At least 2 but less than 4 years') & (df.fourYearDegree2==1),'school_2'] = np.nan
df.loc[(df.schoolLevel=='At least 2 but less than 4 years') & (df.fourYearDegree2==1),'Nation_2'] = np.nan
df.loc[(df.schoolLevel=='At least 2 but less than 4 years') & (df.fourYearDegree2==1),'universityCountry_2'] = np.nan

del df['schoolLevel']

del df['fourYearDegree']
del df['fourYearDegree2']

##############################################################################
# Drop if bonus currency not the same as base currency
##############################################################################

for currVar in ['cashCurrency','stockCurrency','salesCurrency','tipsCurrency','priftCurrency']:
    
    df.loc[df[currVar].notnull(),'diffCurrency'] = 1 * (df[df[currVar].notnull()][currVar] != df[df[currVar].notnull()]['baseCurrency'])

    print("Incorrect currencies = " + str(df.diffCurrency.sum().item()))

    df = df[df.diffCurrency!=1]
    
    del df['diffCurrency']
    
for currVar in ['cashCurrency','stockCurrency','salesCurrency','tipsCurrency','priftCurrency']:
    
    del df[currVar]
    
##############################################################################
# Drop some variables
##############################################################################

print("Repeated Observations = " + str(((df.groupby('salID').yearOfSalary.size()>1).sum().item())))

df = df.groupby('salID').head(1)

del df['GOC']
del df['employeesTotalNum']

del df['basePayAmount']
del df['cashBonusAmount']
del df['stockBonusAmount']
del df['profitSharingAmount']
del df['salesCommissionAmount']
del df['tipsAmount']

del df['startSchool_2']
del df['endSchool_2']
del df['Nation_2']

##############################################################################
# Merge US scorecard data
##############################################################################

#-----------------------------
# Create degree category for merging
#-----------------------------

df['degreeScorecard'] = df['degree']

df.loc[df.degreeScorecard=='MBA','degreeScorecard'] = 'MASTERS'

df.loc[df.degreeScorecard=='JD','degreeScorecard'] = np.nan

df.loc[df.degreeScorecard=='missing','degreeScorecard'] = np.nan

df.loc[df.degreeScorecard=='UNMATCHED','degreeScorecard'] = np.nan

#-----------------------------
# Create degree category for merging
#-----------------------------

scorecard = pd.read_csv(savePath + "US_scorecard_earnings.csv")
del scorecard['Unnamed: 0']

#-----------------------------
# Clean up scorecard university names for merging
#-----------------------------

scorecard.loc[scorecard.school=='Pace University','school'] = 'Pace University-New York'
scorecard.loc[scorecard.school=='The Catholic University of America','school'] = 'Catholic University of America'
scorecard.loc[scorecard.school=='Rutgers University-Newark','school'] = 'Rutgers University–Newark'
scorecard.loc[scorecard.school=='Johnson & Wales University-Providence','school'] = 'Johnson & Wales University'
scorecard.loc[scorecard.school=='Long Island University','school'] = 'LIU-University Center Campus'

df = pd.merge(df, scorecard, how='left', left_on = ['school','degreeScorecard','grpMajor'], right_on = ['school','degreeScorecard','grpMajor'])

#-----------------------------
# Check which US universities didn't merge with scorecard
#-----------------------------

scorecard_uni = scorecard.groupby('school').size().reset_index()

us_uni = df[df.universityCountry=='United States'].groupby('school').yearOfSalary.size().reset_index()

scorecardCheck = pd.merge(scorecard_uni, us_uni, how='left', left_on = ['school'], right_on = ['school'])

scorecardCheck = scorecardCheck[scorecardCheck['yearOfSalary'].isnull()]

##############################################################################
# Merge country university count
##############################################################################

universityCount = pd.read_csv(inputPath + "Country_university_total.csv")

universityCount.rename(columns={'Country':'universityCountry'},inplace=True)
universityCount.rename(columns={'Number of Universities':'numberUniversities'},inplace=True)

df = pd.merge(df, universityCount, how='left', left_on = ['universityCountry'], right_on = ['universityCountry'])

##############################################################################
# Add university specific patent/cite data from Chetty
##############################################################################

# Read in patents/cites data
patentData = pd.read_csv(inputPath + 'University_patent_cite_data.csv')

patentData = patentData[['instnm','inventor','total_patents','total_cites','top5cit']]
patentData = patentData[patentData.instnm.notnull()]

patentData.loc[patentData.instnm=='California Polytechnic State University','instnm'] = "California Polytechnic State University-San Luis Obispo"

# Fuzzymatch to US match in GD data
usUni = df[df.universityCountry=='United States'].groupby('school').head(1)[['school']]
usUni = usUni[usUni.school.notnull()]

newMatches = patentData['instnm'].apply(lambda x: process.extractOne(x,usUni.school,scorer=fuzz.token_sort_ratio))
newMatches = newMatches.reset_index()

nextName = [newMatches.instnm[x][0] for x in range(0,len(newMatches))]
nextScore = [int(newMatches.instnm[x][1]) for x in range(0,len(newMatches))]

nextResults = pd.DataFrame(np.stack((patentData.instnm,nextName,nextScore))).T
nextResults.rename(columns={0:'unmatched'},inplace=True)
nextResults.rename(columns={1:'match'},inplace=True)
nextResults.rename(columns={2:'score'},inplace=True)
backupResults = nextResults.copy()        

# Keep matches
nextResults['keepMatch'] = 0
nextResults.loc[nextResults.score==100,'keepMatch'] = 1
nextResults.loc[nextResults.score==96,'keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Arkansas State University','keepMatch'] = 0
nextResults.loc[nextResults.score==95,'keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University Of Texas At Dallas','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Art Institute Of Fort Lauderdale ','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University Of Texas At Arlington','keepMatch'] = 1
nextResults.loc[nextResults.score==93,'keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='College Of William & Mary','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Art Institute Of Seattle ','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='College Of New Jersey ','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Ohio University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Saint Olaf College','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University Of Alabama','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University Of Montana ','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='College Of Wooster ','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='University Of Tampa ','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Brigham Young University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Montana State University Bozeman','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Ohio University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Embry-Riddle Aeronautical University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='North Carolina Agricultural & Technical State University','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Wheaton College of Wheaton, IL','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Augustana College of Rock Island, IL','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Columbia College of Chicago, IL','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='ITT Technical Institute','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Pratt Institute','keepMatch'] = 1
nextResults.loc[nextResults.unmatched=='Spokane And Spokane Falls Community Colleges','keepMatch'] = 1

nextResults = nextResults[nextResults.keepMatch==1]

nextResults = nextResults[['unmatched','match']]

nextResults.rename(columns={'unmatched':'instnm'},inplace=True)
nextResults.rename(columns={'match':'school'},inplace=True)

patentData = pd.merge(patentData, nextResults, how='left', left_on = ['instnm'], right_on = ['instnm'])

missingSchool = patentData[patentData.school.isnull()]

# Fill in manually schools that didn't successfully fuzzymatch
patentData.loc[patentData.instnm=='North Carolina State University','school'] = 'North Carolina State University at Raleigh'
patentData.loc[patentData.instnm=='Purdue University','school'] = 'Purdue University-Main Campus'
patentData.loc[patentData.instnm=='Georgia Institute Of Technology','school'] = 'Georgia Institute of Technology-Main Campus'
patentData.loc[patentData.instnm=='University Of Virginia','school'] = 'University of Virginia-Main Campus'
patentData.loc[patentData.instnm=='Pennsylvania State University ','school'] = 'Pennsylvania State University-Main Campus'
patentData.loc[patentData.instnm=='State University Of New York At Buffalo','school'] = 'University at Buffalo'
patentData.loc[patentData.instnm=='University Of Cincinnati','school'] = 'University of Cincinnati-Main Campus'
patentData.loc[patentData.instnm=='Lawrence University Of Wisconsin','school'] = 'Lawrence University'
patentData.loc[patentData.instnm=='Texas A&M University','school'] = 'Texas A & M University-College Station'
patentData.loc[patentData.instnm=='University Of Akron ','school'] = 'University of Akron Main Campusa'
patentData.loc[patentData.instnm=='University Of New Hampshire','school'] = 'University of New Hampshire-Main Campus'
patentData.loc[patentData.instnm=='Colorado State University','school'] = 'Colorado State University-Fort Collins'
patentData.loc[patentData.instnm=='Rutgers, The State University Of New Jersey','school'] = 'Rutgers University-New Brunswick'
patentData.loc[patentData.instnm=='Bethel University of Saint Paul, MN','school'] = 'Bethel University'
patentData.loc[patentData.instnm=='Oklahoma State University','school'] = 'Oklahoma State University-Main Campus'
patentData.loc[patentData.instnm=='University Of Oklahoma','school'] = 'University of Oklahoma-Norman Campus'
patentData.loc[patentData.instnm=='Ohio State University ','school'] = 'Ohio State University-Main Campus'
patentData.loc[patentData.instnm=='University Of Vermont And State Agricultural College','school'] = 'University of Vermont'
patentData.loc[patentData.instnm=='State University Of New York At Stony Brook','school'] = 'Stony Brook University'
patentData.loc[patentData.instnm=='University Of South Florida','school'] = 'University of South Florida-Main Campus'
patentData.loc[patentData.instnm=='Furman University','school'] = 'Furman'
patentData.loc[patentData.instnm=='Miami University','school'] = 'Miami University-Oxford'
patentData.loc[patentData.instnm=='University Of New Mexico','school'] = 'University of New Mexico-Main Campus'
patentData.loc[patentData.instnm=='Richard Stockton College Of New Jersey ','school'] = 'Stockton University'
patentData.loc[patentData.instnm=='Wright State University','school'] = 'Wright State University-Main Campus'
patentData.loc[patentData.instnm=='Columbia College of Chicago, IL','school'] = 'Columbia College Chicago'
patentData.loc[patentData.instnm=='State University Of New York At Albany','school'] = 'SUNY at Albany'
patentData.loc[patentData.instnm=='Missouri State University','school'] = 'Missouri State University-Springfield'
patentData.loc[patentData.instnm=='New Mexico State University','school'] = 'New Mexico State University-Main Campus'
patentData.loc[patentData.instnm=='SUNY College At Buffalo','school'] = 'SUNY Buffalo State'
patentData.loc[patentData.instnm=='Arkansas State University','school'] = 'Arkansas State University-Main Campus'
patentData.loc[patentData.instnm=='Kent State University','school'] = 'Kent State University at Kent'
patentData.loc[patentData.instnm=='Bowling Green State University','school'] = 'Bowling Green State University-Main Campus'
patentData.loc[patentData.instnm=="Saint John's University of Queens, NY",'school'] = 'Saint Johns University'
patentData.loc[patentData.instnm=='Iowa State University Of Science & Technology','school'] = 'Iowa State University'
patentData.loc[patentData.instnm=='Union College of Schenectady, NY','school'] = 'Union College'
patentData.loc[patentData.instnm=='Pratt Institute','school'] = 'Pratt Institute-Main'
patentData.loc[patentData.instnm=='University Of Saint Thomas of Saint Paul, MN','school'] = 'University of St Thomas'

missingSchool = patentData[patentData.school.isnull()]

#------------------------------
# Add observations and new merging that cover multiple universities 
#------------------------------

# Setup

patentData.rename(columns={'school':'mergePatentSchool'},inplace=True)

df['mergePatentSchool'] = df.school

# Add individually mapping for each system

patentData.loc[patentData.instnm=="University Of Alaska System",'mergePatentSchool'] = 'University Of Alaska System'
df.loc[df.school=="University of Alaska Anchorage",'mergePatentSchool'] = 'University Of Alaska System'
df.loc[df.school=="University of Alaska Fairbanks",'mergePatentSchool'] = 'University Of Alaska System'
df.loc[df.school=="University of Alaska Southeast",'mergePatentSchool'] = 'University Of Alaska System'

patentData.loc[patentData.instnm=="University Of Colorado System",'mergePatentSchool'] = 'University Of Colorado System'
df.loc[df.school=="University of Colorado Boulder",'mergePatentSchool'] = 'University Of Colorado System'
df.loc[df.school=="University of Colorado Colorado Springs",'mergePatentSchool'] = 'University Of Colorado System'
df.loc[df.school=="University of Colorado Denver/Anschutz Medical Campus",'mergePatentSchool'] = 'University Of Colorado System'

patentData.loc[patentData.instnm=="University Of Hawaii And Hawaii Community Colleges",'mergePatentSchool'] = 'University Of Hawaii And Hawaii Community Colleges'
df.loc[df.school=="University of Hawaii Maui College",'mergePatentSchool'] = 'University Of Hawaii And Hawaii Community Colleges'
df.loc[df.school=="University of Hawaii System Office",'mergePatentSchool'] = 'University Of Hawaii And Hawaii Community Colleges'
df.loc[df.school=="University of Hawaii at Hilo",'mergePatentSchool'] = 'University Of Hawaii And Hawaii Community Colleges'
df.loc[df.school=="University of Hawaii at Manoa",'mergePatentSchool'] = 'University Of Hawaii And Hawaii Community Colleges'
df.loc[df.school=="University of Hawaii-West Oahu",'mergePatentSchool'] = 'University Of Hawaii And Hawaii Community Colleges'

patentData.loc[patentData.instnm=="University Of Maine System",'mergePatentSchool'] = 'University Of Maine System'
df.loc[df.school=="University of Maine",'mergePatentSchool'] = 'University Of Maine System'
df.loc[df.school=="University of Maine at Augusta",'mergePatentSchool'] = 'University Of Maine System'
df.loc[df.school=="University of Maine at Machias",'mergePatentSchool'] = 'University Of Maine System'

patentData.loc[patentData.instnm=="University Of Massachusetts System",'mergePatentSchool'] = 'University Of Massachusetts System'
df.loc[df.school=="University of Massachusetts Medical School",'mergePatentSchool'] = 'University Of Massachusetts System'
df.loc[df.school=="University of Massachusetts-Amherst",'mergePatentSchool'] = 'University Of Massachusetts System'
df.loc[df.school=="University of Massachusetts-Boston",'mergePatentSchool'] = 'University Of Massachusetts System'
df.loc[df.school=="University of Massachusetts-Dartmouth",'mergePatentSchool'] = 'University Of Massachusetts System'
df.loc[df.school=="University of Massachusetts-Lowell",'mergePatentSchool'] = 'University Of Massachusetts System'

patentData.loc[patentData.instnm=="University Of Nebraska System",'mergePatentSchool'] = 'University Of Nebraska System'
df.loc[df.school=="University of Nebraska Medical Center",'mergePatentSchool'] = 'University Of Nebraska System'
df.loc[df.school=="University of Nebraska at Kearney",'mergePatentSchool'] = 'University Of Nebraska System'
df.loc[df.school=="University of Nebraska at Omaha",'mergePatentSchool'] = 'University Of Nebraska System'
df.loc[df.school=="University of Nebraska-Lincoln",'mergePatentSchool'] = 'University Of Nebraska System'

patentData.loc[patentData.instnm=="University Of Pittsburgh System",'mergePatentSchool'] = 'University Of Pittsburgh System'
df.loc[df.school=="University of Pittsburgh-Bradford",'mergePatentSchool'] = 'University Of Pittsburgh System'
df.loc[df.school=="University of Pittsburgh-Greensburg",'mergePatentSchool'] = 'University Of Pittsburgh System'
df.loc[df.school=="University of Pittsburgh-Johnstown",'mergePatentSchool'] = 'University Of Pittsburgh System'
df.loc[df.school=="University of Pittsburgh-Pittsburgh Campus",'mergePatentSchool'] = 'University Of Pittsburgh System'

patentData.loc[patentData.instnm=="University Of South Carolina System",'mergePatentSchool'] = 'University Of South Carolina System'
df.loc[df.school=="University of South Carolina-Aiken",'mergePatentSchool'] = 'University Of South Carolina System'
df.loc[df.school=="University of South Carolina-Beaufort",'mergePatentSchool'] = 'University Of South Carolina System'
df.loc[df.school=="University of South Carolina-Columbia",'mergePatentSchool'] = 'University Of South Carolina System'
df.loc[df.school=="University of South Carolina-Upstate",'mergePatentSchool'] = 'University Of South Carolina System'

patentData.loc[patentData.instnm=="University Of Washington System",'mergePatentSchool'] = 'University Of Washington System'
df.loc[df.school=="University of Washington-Bothell Campus",'mergePatentSchool'] = 'University Of Washington System'
df.loc[df.school=="University of Washington-Seattle Campus",'mergePatentSchool'] = 'University Of Washington System'

patentData.loc[patentData.instnm=="West Virginia University, West Virginia University Institute Of Technology, West Virginia University - Parkersburg, And Potomac State College",'mergePatentSchool'] = 'West Virginia University, West Virginia University Institute Of Technology, West Virginia University - Parkersburg, And Potomac State College'
df.loc[df.school=="West Virginia University",'mergePatentSchool'] = 'West Virginia University, West Virginia University Institute Of Technology, West Virginia University - Parkersburg, And Potomac State College'
df.loc[df.school=="West Virginia University Institute of Technology",'mergePatentSchool'] = 'West Virginia University, West Virginia University Institute Of Technology, West Virginia University - Parkersburg, And Potomac State College'
df.loc[df.school=="Potomac State College of West Virginia University",'mergePatentSchool'] = 'West Virginia University, West Virginia University Institute Of Technology, West Virginia University - Parkersburg, And Potomac State College'

patentData.loc[patentData.instnm=="Louisiana State University System",'mergePatentSchool'] = 'Louisiana State University System'
df.loc[df.school=="Louisiana State University and Agricultural & Mechanical College",'mergePatentSchool'] = 'Louisiana State University System'
df.loc[df.school=="Louisiana State University-Alexandria",'mergePatentSchool'] = 'Louisiana State University System'
df.loc[df.school=="Louisiana State University-Shreveport",'mergePatentSchool'] = 'Louisiana State University System'

patentData.loc[patentData.instnm=="Minnesota State University System, Century And Various Other Minnesota Community Colleges",'mergePatentSchool'] = 'Minnesota State University System, Century And Various Other Minnesota Community Colleges'
df.loc[df.school=="Minnesota State University Moorhead",'mergePatentSchool'] = 'Minnesota State University System, Century And Various Other Minnesota Community Colleges'
df.loc[df.school=="Minnesota State University-Mankato",'mergePatentSchool'] = 'Minnesota State University System, Century And Various Other Minnesota Community Colleges'

patentData.loc[patentData.instnm=="Devry University, Devry Institute Of Technology And Denver Technical College",'mergePatentSchool'] = 'Devry University, Devry Institute Of Technology And Denver Technical College'
df.loc[df.school=="DeVry University-Administrative Office",'mergePatentSchool'] = 'Devry University, Devry Institute Of Technology And Denver Technical College'

# Merge with GD data

missingSchool = patentData[patentData.mergePatentSchool.isnull()]

patentData = patentData[patentData.mergePatentSchool.notnull()]

del patentData['instnm']

df = pd.merge(df, patentData, how='left', left_on = ['mergePatentSchool'], right_on = ['mergePatentSchool'])

##############################################################################
# Save dataset
##############################################################################

allUniversities = df.groupby(['universityCountry','school']).size().reset_index()

del df['quality_of_education']
del df['quality_of_faculty']
del df['alumni_employment']
del df['research_performance']

df.loc[df.metro=='Puerto Rico','countryName'] = 'Puerto Rico'
df = df[df.countryName!='Puerto Rico']

df.to_csv(savePath + 'Salaries_international_dataset_main.csv')

#df = pd.read_csv(savePath + 'Salaries_international_dataset_main.csv')
#del df['Unnamed: 0']

print("Finished R_create_salaries_international.")

